In [68]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import os
%matplotlib inline

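The topic–word probability matrix: row k of `Pword` is topic k's probability distribution over the 10 vocabulary words (each row sums to 1)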


In [69]:
# Ground-truth word distribution of each topic: one row per topic (3),
# one column per vocabulary word (10). Words 0-2, 3-5 and 6-8 are each
# used by a single topic; word 9 has probability 0.1 in every topic.
Pword = np.array([
    # w0   w1   w2   w3   w4   w5   w6   w7   w8   w9
    [0.2, 0.2, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
    [0.0, 0.0, 0.0, 0.3, 0.3, 0.3, 0.0, 0.0, 0.0, 0.1],
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.3, 0.4, 0.1],
])
Pword


Out[69]:
array([[ 0.2,  0.2,  0.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1],
       [ 0. ,  0. ,  0. ,  0.3,  0.3,  0.3,  0. ,  0. ,  0. ,  0.1],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.2,  0.3,  0.4,  0.1]])

In [70]:
# Read both sizes from the matrix so they cannot drift away from Pword
# (3 topics x 10 words for the matrix defined above).
num_topics, num_words = Pword.shape

plt.figure(1, figsize=(10,10))
# One colour per topic; num_topics+1 samples are taken from the colormap,
# so its last colour is left unused.
colors = iter(cm.rainbow(np.linspace(0, 1, num_topics+1)))
word_ids = range(num_words)

# Overlay one translucent bar series per topic on the same axes.
for k in range(num_topics):
    # align="edge" makes the bar of word w span [w, w + 0.8] on every
    # Matplotlib version. The default became "center" in Matplotlib 2.0,
    # which would cut the word-0 bar in half at the xlim(0, ...) set below.
    plt.bar(word_ids, Pword[k], color=next(colors), alpha=0.4,
            edgecolor="none", align="edge", label="topic %d" % k)

plt.xlim(0, num_words)
plt.ylim(0,1)
plt.xlabel("word id")
plt.ylabel("P(word | topic)")
plt.title("True topic-word distributions (Pword)")
plt.legend()
plt.show()


Generate documents


In [86]:
# --- Synthetic corpus settings -------------------------------------------
alpha = (0.1, 0.1, 0.1)  # Dirichlet parameters of the per-document topic mixture
# alpha = (1.0, 1.0, 1.0)
numDocs = 1000
numWords = 10  # number of tokens drawn per document (not the vocabulary size)
SEED = 0       # fixed seed: re-running this cell regenerates the same corpus

# Take the sizes from Pword instead of repeating the literal 10.
n_mix_topics, vocab_size = Pword.shape
assert len(alpha) == n_mix_topics, "alpha needs one entry per row of Pword"

# Private generator, so seeding here does not touch numpy's global random state.
rng = np.random.RandomState(SEED)

corpus_path = os.path.join(os.getcwd(), "..", "datasets", "artificial.dat")

# Each document becomes one text line of the form
#   "<number of distinct words> <word id>:<count> <word id>:<count> ... "
# (with a trailing space, exactly as this cell has always written it).
doc_lines = []
for d in range(numDocs):
    # Topic mixture of this document.
    topics = rng.dirichlet(alpha)

    # One topic per token, drawn from the *unrounded* mixture. Rounding the
    # mixture to two decimals (as was done before) zeroes every component
    # below 0.005, and np.random.multinomial then hands the missing mass to
    # the last topic, which biases the generated corpus.
    token_topics = rng.choice(n_mix_topics, size=numWords, p=topics)

    # One word per token, drawn from the word distribution of its topic.
    word_counts = np.zeros(vocab_size, dtype=int)
    for topic in token_topics:
        word = rng.choice(vocab_size, p=Pword[topic])
        word_counts[word] += 1

    entries = ["%d:%d" % (w, c) for w, c in enumerate(word_counts) if c > 0]
    doc_lines.append(" ".join([str(len(entries))] + entries) + " ")

# Open the file only after generation has succeeded and let `with` close it
# even if the write fails. Lines are newline-separated, with no newline
# after the last document.
with open(corpus_path, "w") as fo:
    fo.write("\n".join(doc_lines))

Load the inferred topic–word matrix (`beta`), the estimated counterpart of `Pword`


In [ ]:
# Load the inferred topic-word weights: one row per line of the file,
# values separated by single spaces, no header row.
path = os.path.join(os.getcwd(), "..", "param", "artificial", "beta.dat")
beta = pd.read_table(path, sep=" ", header = None)
# Divide every row by its own sum so that each row adds up to 1.
# NOTE(review): this assumes the file holds non-negative weights (not
# log-probabilities) - confirm against the program that writes beta.dat.
beta = beta.div(beta.sum(1),0)

plt.figure(1, figsize=(10,10))
# Same colour scheme as the plot of the true topics.
colors = iter(cm.rainbow(np.linspace(0, 1, num_topics+1)))
# Bar positions follow the columns actually present in the file, so plt.bar
# never receives x and height arrays of different lengths. The recorded
# output of `beta.loc[1]` in this notebook has 9 entries, not num_words.
word_ids = np.arange(beta.shape[1])

for k in range(num_topics):
    # align="edge" makes the bar of word w span [w, w + 0.8] on every
    # Matplotlib version. The default became "center" in Matplotlib 2.0,
    # which would cut the word-0 bar in half at the xlim(0, ...) set below.
    plt.bar(word_ids, beta.loc[k].values, color=next(colors), alpha=0.4,
            edgecolor="none", align="edge", label="inferred topic %d" % k)

plt.xlim(0, num_words)
plt.ylim(0,1)
plt.xlabel("word id")
plt.ylabel("P(word | topic)")
plt.title("Inferred topic-word distributions (beta)")
plt.legend()
plt.show()

In [83]:
# Show the row-normalised inferred word distribution stored under row label 1.
beta.loc[1]


Out[83]:
0    0.032670
1    0.008945
2    0.032533
3    0.116847
4    0.023655
5    0.085884
6    0.185510
7    0.244405
8    0.269551
Name: 1, dtype: float64

In [47]:
# Mean absolute difference between true and inferred word probabilities,
# one value per row (topic). The divisor is the vocabulary size taken from
# Pword rather than a hard-coded 10.
# NOTE(review): rows are compared by position, but the order of the inferred
# topics is presumably arbitrary; a large value may only mean that two topics
# are swapped - confirm the row correspondence before reading these numbers.
(Pword - beta).abs().sum(1) / Pword.shape[1]


Out[47]:
0    0.151708
1    0.006457
2    0.180131
dtype: float64

In [ ]: